{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dd3c16ee-6929-4ecf-a442-9555d0b97c03",
   "metadata": {},
   "source": [
    "#### Objective: Clean and save the needed attributes, and create the table used to generate migration points."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7f7bc3c9-c297-4e02-9d4f-3e27d2223492",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import ast,os,random\n",
    "pd.set_option('display.float_format','{:.1f}'.format)\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import cudf, cupy as cp\n",
    "import numpy as np\n",
    "import time\n",
    "import math\n",
    "import pickle\n",
    "# pd.set_option('display.max_colwidth', -1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a5b2e784-179d-404f-be48-c2d84bbcf9a7",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c9e38991-3723-4edb-a4b4-f890f24bd85f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read only the columns this notebook uses from the mapped-blocks CSV.\n",
    "df = pd.read_csv(\n",
    "    'data/mapped_blocks_full.csv',\n",
    "    usecols=['ID20','STATE','COUNTY','P20','eq_P10'],\n",
    "    encoding='unicode_escape',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b5dbead9-dd67-409a-b81e-7c8dd69c97cc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "334735155"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: grand total of the P20 column.\n",
    "df['P20'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2f0bafbd-5a2e-4b86-8ac9-fd2a83a3889a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean county names. The result is assigned back to the column instead of calling\n",
    "# `df.COUNTY.replace(..., inplace=True)`: an in-place replace on a column accessor is\n",
    "# chained assignment, which does not update `df` under pandas Copy-on-Write.\n",
    "df['COUNTY'] = (\n",
    "    df['COUNTY']\n",
    "    # 1) drop every run of non-ASCII characters\n",
    "    .replace({r'[^\\x00-\\x7F]+':''},regex=True)\n",
    "    # 2) drop a run of uppercase letters glued directly onto a capitalised word\n",
    "    .replace({r'([A-Z][a-z]+)([A-Z]+)':r'\\1'},regex=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4d82f1b8-e111-463c-8148-82238a2098d5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8174955"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows loaded.\n",
    "df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b813f68f-448a-4a97-af82-cbca5f15266b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>P20</th>\n",
       "      <th>eq_P10</th>\n",
       "      <th>block_diff</th>\n",
       "      <th>block_net</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>21</td>\n",
       "      <td>30.5</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>34</td>\n",
       "      <td>30.5</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>29</td>\n",
       "      <td>51.8</td>\n",
       "      <td>-23.0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10010201001003</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>17</td>\n",
       "      <td>13.3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10010201001004</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID20  STATE          COUNTY  P20  eq_P10  block_diff  block_net\n",
       "0  10010201001000      1  Autauga County   21    30.5       -10.0         -1\n",
       "1  10010201001001      1  Autauga County   34    30.5         4.0          1\n",
       "2  10010201001002      1  Autauga County   29    51.8       -23.0         -1\n",
       "3  10010201001003      1  Autauga County   17    13.3         4.0          1\n",
       "4  10010201001004      1  Autauga County    0     0.0         0.0          0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-block change: P20 minus eq_P10, rounded to a whole number.\n",
    "df['block_diff'] = (df['P20'] - df['eq_P10']).round()\n",
    "# Sign of the change: 1 if positive, -1 if negative, 0 otherwise. np.sign is vectorised and\n",
    "# replaces a Python lambda that was called once per row; fillna(0) keeps a NaN difference\n",
    "# mapped to 0 exactly as the lambda did, and int64 matches the lambda's integer output.\n",
    "df['block_net'] = np.sign(df['block_diff']).fillna(0).astype('int64')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "99f14f0c-35b0-4a6f-8d4f-b5286ef6c9e7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>P20</th>\n",
       "      <th>eq_P10</th>\n",
       "      <th>block_diff</th>\n",
       "      <th>block_net</th>\n",
       "      <th>error</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>21</td>\n",
       "      <td>30.0</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>-1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>34</td>\n",
       "      <td>30.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>29</td>\n",
       "      <td>52.0</td>\n",
       "      <td>-23.0</td>\n",
       "      <td>-1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10010201001003</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>17</td>\n",
       "      <td>13.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10010201001004</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID20  STATE          COUNTY  P20  eq_P10  block_diff  block_net  \\\n",
       "0  10010201001000      1  Autauga County   21    30.0       -10.0         -1   \n",
       "1  10010201001001      1  Autauga County   34    30.0         4.0          1   \n",
       "2  10010201001002      1  Autauga County   29    52.0       -23.0         -1   \n",
       "3  10010201001003      1  Autauga County   17    13.0         4.0          1   \n",
       "4  10010201001004      1  Autauga County    0     0.0         0.0          0   \n",
       "\n",
       "   error  \n",
       "0    1.0  \n",
       "1    0.0  \n",
       "2    0.0  \n",
       "3    0.0  \n",
       "4    0.0  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Snap eq_P10 to whole numbers, then record how far (P20 - eq_P10) now sits from the\n",
    "# already-rounded block_diff.\n",
    "df['eq_P10'] = df['eq_P10'].round(decimals=0)\n",
    "df['error'] = df['P20'].sub(df['eq_P10']).sub(df['block_diff'])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "85d04ec7-c40a-491f-b5ac-dbdddfa087c4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>P20</th>\n",
       "      <th>eq_P10</th>\n",
       "      <th>block_diff</th>\n",
       "      <th>block_net</th>\n",
       "      <th>error</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [ID20, STATE, COUNTY, P20, eq_P10, block_diff, block_net, error]\n",
       "Index: []"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Make eq_P10 consistent with the rounded block_diff. The earlier form `eq_P10 + error`\n",
    "# (with error = P20 - eq_P10 - block_diff) reduces to P20 - block_diff. Writing it directly\n",
    "# keeps this cell idempotent; the additive form applied the correction again on every re-run.\n",
    "df['eq_P10'] = df['P20'] - df['block_diff']\n",
    "# Rows where P20 - eq_P10 still differs from block_diff (expected: none).\n",
    "df[(df['P20']-df['eq_P10'])!=(df['block_diff'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "eca29aec-38f6-45ec-a64c-a67582fd79ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save attributes to be added later. STATE is written as well (appended last so the existing\n",
    "# column positions are unchanged) because the population-generation section selects\n",
    "# df[['ID20','STATE','points']] from the frame reloaded from this file.\n",
    "df[['ID20','COUNTY','P20','eq_P10','block_diff','block_net','STATE']].to_parquet('data/total_attr_gen_df.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e74d0125-ec88-43e9-96a0-9f68202fb0b5",
   "metadata": {},
   "source": [
    "#### Attach county"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6db27018-a14b-4a4c-8181-6386ab9a6430",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>P20</th>\n",
       "      <th>eq_P10</th>\n",
       "      <th>block_diff</th>\n",
       "      <th>block_net</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>21</td>\n",
       "      <td>31.0</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10010201001001</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>34</td>\n",
       "      <td>30.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10010201001002</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>29</td>\n",
       "      <td>52.0</td>\n",
       "      <td>-23.0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10010201001003</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>17</td>\n",
       "      <td>13.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10010201001004</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID20          COUNTY  P20  eq_P10  block_diff  block_net\n",
       "0  10010201001000  Autauga County   21    31.0       -10.0         -1\n",
       "1  10010201001001  Autauga County   34    30.0         4.0          1\n",
       "2  10010201001002  Autauga County   29    52.0       -23.0         -1\n",
       "3  10010201001003  Autauga County   17    13.0         4.0          1\n",
       "4  10010201001004  Autauga County    0     0.0         0.0          0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reload the saved attribute table and preview its first rows.\n",
    "df = pd.read_parquet(path='data/total_attr_gen_df.parquet')\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ef7dcf13-ebe0-473a-89bd-87e97327de16",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_points(row):\n",
    "    \"\"\"Return how many points to generate for one block.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    row : pandas.Series or sequence\n",
    "        Values read by position: the first is P20, the second is eq_P10 and the last is\n",
    "        block_net (the column order the caller in this notebook passes).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    P20 + eq_P10 when block_net is negative, otherwise P20.\n",
    "    \"\"\"\n",
    "    # Integer keys on a label-indexed Series are deprecated as positional lookups\n",
    "    # (pandas >= 2.1), so go through .iloc when it exists; plain lists/tuples still work.\n",
    "    values = row.iloc if hasattr(row, 'iloc') else row\n",
    "    net, p20, p10 = values[-1], values[0], values[1]\n",
    "    if net < 0:\n",
    "        return p20 + p10\n",
    "    return p20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "34b846d8-d09a-4713-bfa2-987b2b189a8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Points per block: P20, plus eq_P10 when block_net is negative.\n",
    "# Same rule as calculate_points, evaluated column-wise instead of one Python call per row\n",
    "# (the frame has millions of rows); the result is float, as with the row-wise apply.\n",
    "df['points'] = df['P20'] + df['eq_P10'].where(df['block_net'] < 0, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cc3891e4-3480-441f-8291-fad3f01320df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>P20</th>\n",
       "      <th>eq_P10</th>\n",
       "      <th>block_diff</th>\n",
       "      <th>block_net</th>\n",
       "      <th>points</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>21</td>\n",
       "      <td>31.0</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>-1</td>\n",
       "      <td>52.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10010201001001</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>34</td>\n",
       "      <td>30.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>34.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10010201001002</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>29</td>\n",
       "      <td>52.0</td>\n",
       "      <td>-23.0</td>\n",
       "      <td>-1</td>\n",
       "      <td>81.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10010201001003</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>17</td>\n",
       "      <td>13.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10010201001004</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8174950</th>\n",
       "      <td>721537506022011</td>\n",
       "      <td>Yauco Municipio</td>\n",
       "      <td>27</td>\n",
       "      <td>6.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>1</td>\n",
       "      <td>27.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8174951</th>\n",
       "      <td>721537506022012</td>\n",
       "      <td>Yauco Municipio</td>\n",
       "      <td>43</td>\n",
       "      <td>63.0</td>\n",
       "      <td>-20.0</td>\n",
       "      <td>-1</td>\n",
       "      <td>106.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8174952</th>\n",
       "      <td>721537506022013</td>\n",
       "      <td>Yauco Municipio</td>\n",
       "      <td>195</td>\n",
       "      <td>341.0</td>\n",
       "      <td>-146.0</td>\n",
       "      <td>-1</td>\n",
       "      <td>536.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8174953</th>\n",
       "      <td>721537506022014</td>\n",
       "      <td>Yauco Municipio</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8174954</th>\n",
       "      <td>721537506022015</td>\n",
       "      <td>Yauco Municipio</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8174955 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    ID20           COUNTY  P20  eq_P10  block_diff  block_net  \\\n",
       "0         10010201001000   Autauga County   21    31.0       -10.0         -1   \n",
       "1         10010201001001   Autauga County   34    30.0         4.0          1   \n",
       "2         10010201001002   Autauga County   29    52.0       -23.0         -1   \n",
       "3         10010201001003   Autauga County   17    13.0         4.0          1   \n",
       "4         10010201001004   Autauga County    0     0.0         0.0          0   \n",
       "...                  ...              ...  ...     ...         ...        ...   \n",
       "8174950  721537506022011  Yauco Municipio   27     6.0        21.0          1   \n",
       "8174951  721537506022012  Yauco Municipio   43    63.0       -20.0         -1   \n",
       "8174952  721537506022013  Yauco Municipio  195   341.0      -146.0         -1   \n",
       "8174953  721537506022014  Yauco Municipio    0     0.0         0.0          0   \n",
       "8174954  721537506022015  Yauco Municipio    0     0.0         0.0          0   \n",
       "\n",
       "         points  \n",
       "0          52.0  \n",
       "1          34.0  \n",
       "2          81.0  \n",
       "3          17.0  \n",
       "4           0.0  \n",
       "...         ...  \n",
       "8174950    27.0  \n",
       "8174951   106.0  \n",
       "8174952   536.0  \n",
       "8174953     0.0  \n",
       "8174954     0.0  \n",
       "\n",
       "[8174955 rows x 7 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the frame to inspect the newly added points column.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6265d762-7a4b-4e18-9383-7866d35b3246",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the county lookup (indexed by COUNTY value where it is used further down).\n",
    "# The with-block closes the file handle that a bare open(...) call left open.\n",
    "# NOTE(review): pickle.load can execute arbitrary code from the file; only load pickles\n",
    "# produced by this project.\n",
    "with open('county2id.pkl','rb') as county2id_file:\n",
    "    county2id = pickle.load(county2id_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8171f642-0b80-4a8a-a7a2-d1462ce61644",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Jefferson County      96055\n",
       "Los Angeles County    91626\n",
       "Cook County           85108\n",
       "Washington County     75565\n",
       "Montgomery County     66524\n",
       "Maricopa County       61427\n",
       "Franklin County       60891\n",
       "Orange County         60830\n",
       "Jackson County        60381\n",
       "Wayne County          59249\n",
       "Name: COUNTY, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ten most frequent COUNTY values (rows per county name).\n",
    "df['COUNTY'].value_counts().iloc[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "a49f4f74-ca82-4ad8-a23c-f185f33fd84f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only blocks with a non-zero point count and renumber the index from 0.\n",
    "has_points = df['points'].ne(0)\n",
    "df = df.loc[has_points].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "5f766288-a5d7-4f38-8c7c-5d09b04a7a75",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6200461"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check: total points over rows whose COUNTY is 'Maricopa County'.\n",
    "df.loc[df['COUNTY'].eq('Maricopa County'), 'points'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c064904a-7a6a-4d55-b64b-e2433d8b2ec7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the point counts as 32-bit integers.\n",
    "df = df.astype({'points': 'int32'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "e6ab7c4c-f044-49e1-9c28-c8734ab87160",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One list per block: the county2id value for the block's COUNTY, repeated `points` times.\n",
    "# Iterating the two columns with zip avoids row-wise apply and the deprecated positional\n",
    "# row[0] / row[1] lookups on a label-indexed Series; the result is the same Series of lists.\n",
    "counties = pd.Series(\n",
    "    [[county2id[county]] * n_points for county, n_points in zip(df['COUNTY'], df['points'])],\n",
    "    index=df.index,\n",
    "    dtype='object',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "71b59e6e-5d8c-44c9-818f-3fce3cf12350",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the pandas Series of per-block lists into a cuDF object for the explode step.\n",
    "gcounties = cudf.from_pandas(counties)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "ac88db07-26ee-4cc0-8102-90e6fa247fa6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten the per-block lists into one element per row and renumber the index from 0.\n",
    "counties_list = gcounties.explode().reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "40e5fe64-2bb7-46bf-a741-470ad993f98e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the exploded series. The with-block guarantees the file is flushed and closed,\n",
    "# which the bare open(...) call did not.\n",
    "with open('county_list.pkl','wb') as county_list_file:\n",
    "    pickle.dump(counties_list, county_list_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "ccc64011-00ed-43b5-ac0f-332136f6e180",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "504475979"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total number of exploded entries (one per generated point).\n",
    "counties_list.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20524a87-e169-4306-acde-2f17edd3721f",
   "metadata": {},
   "source": [
    "#### Continue building the dataset for population generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "875e176a-633b-4bad-ba4c-09b2f97892b5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8174955\n"
     ]
    }
   ],
   "source": [
    "# Row count before dropping zero-point blocks.\n",
    "print(df.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "dd1e348d-3753-4ab5-9da7-e0d67e0a7946",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only blocks whose point count is non-zero (the original index is kept).\n",
    "df = df.loc[df['points'] != 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "330fe5a7-5e55-48e4-b88a-3b77a42c526a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6265163\n"
     ]
    }
   ],
   "source": [
    "# Row count after dropping zero-point blocks.\n",
    "print(len(df.index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "9d2dab42-4e75-4618-8166-958653847976",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns needed for population generation; df must carry a STATE column at this point.\n",
    "gen_df = df.loc[:, ['ID20','STATE','points']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "3f0c26b3-e585-44b2-96a1-05261a4379db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the generation table to CSV; the index is written as the first column.\n",
    "gen_df.to_csv(path_or_buf='data/total_population_gen_df.csv', index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "e321a5b6-dfc0-4617-a74d-df5368e79d63",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6265163"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows written to the generation table.\n",
    "gen_df.shape[0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
